home *** CD-ROM | disk | FTP | other *** search
- #
- # HTMLEncoding.py
- # JunkMatcher
- #
- # Created by Benjamin Han on 2/1/05.
- # Copyright (c) 2005 Benjamin Han. All rights reserved.
- #
-
- # This program is free software; you can redistribute it and/or
- # modify it under the terms of the GNU General Public License
- # as published by the Free Software Foundation; either version 2
- # of the License, or (at your option) any later version.
-
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
-
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
- #!/usr/bin/env python
-
- # IMPORTANT: both HTMLEncodingExtractor and HTMLFormatter can have only a single instance!
-
- from sgmllib import SGMLParser
- import threading # to ensure thread-safety, since we will have only one
- # global HTMLEncodingExtractor and HTMLFormatter
-
- from consts import *
- from utilities import *
- import htmlentitydefs
-
-
- _metaCharsetPat = re.compile(r'charset\s*=\s*([^\s"\'>]*)')
-
- # there is no danger of reentrant locking, so we don't use RLock here
- _htmlEncodingExtractorLock = threading.Lock()
- _htmlFormatterLock = threading.Lock()
-
-
class HTMLEncodingExtractor (SGMLParser):
    """
    A SGMLParser-derived parser to extract web page encoding
    --------------------------------------------------------
    encoding:   charset found by the last extract() call (None if none found)
    hasTagHTML: True iff the given src has <HTML> tag
    hasTagHead: True iff the given src has <HEAD> tag

    Call extract(src) to extract the encoding.
    """
    def reset (self):
        # extend; also called at the start of every extract() to clear the
        # state left over from the previous document
        SGMLParser.reset(self)
        self.encoding = None
        self._tagStack = []
        self.hasTagHTML = self.hasTagHead = False

    def unknown_starttag(self, tag, attributes):
        # remember whether <html>/<head> occur at all, and track tag nesting
        if tag == 'html': self.hasTagHTML = True
        elif tag == 'head': self.hasTagHead = True
        self._tagStack.append(tag)

    def unknown_endtag(self, tag):
        # pop only when the end tag closes the innermost open tag; stray or
        # mismatched end tags leave the stack untouched
        if self._tagStack and self._tagStack[-1] == tag:
            del self._tagStack[-1]

    def do_meta (self, attrs):
        # A <meta> is examined only where a document-level charset declaration
        # can legitimately live: directly inside <html><head>, inside a lone
        # <head>, or before any other tag has been opened.
        stack = self._tagStack
        if stack in ([], ['head']) or stack[-2:] == ['html', 'head']:
            self._extractEncoding(attrs)

    def _extractEncoding (self, attrs):
        """Record the charset if attrs describe a content-type <meta> tag.

        attrs is a list of (name, value) tuples. Once a charset has been found
        the parser is told to stop interpreting tags (setnomoretags), so the
        first declaration wins.
        """
        attrDict = dict(attrs)
        httpEquiv = attrDict.get('http-equiv')
        if httpEquiv and httpEquiv.lower() == 'content-type':
            # .get(): a content-type meta without a content attribute must not
            # raise KeyError and abort the whole extraction
            content = attrDict.get('content')
            if content:
                mo = _metaCharsetPat.search(content)
                if mo:
                    self.encoding = mo.group(1)
                    self.setnomoretags()

    def extract (self, htmlSrc):
        """Extract the encoding from htmlSrc; returns the encoding (could be None)

        Any exception raised while parsing is reported through printException()
        and None is returned.
        """
        # thread-safety: multiple threads may call extract() and access the ivars simultaneously
        _htmlEncodingExtractorLock.acquire()
        try:
            self.reset()
            self.feed(htmlSrc)
            self.close()
            encoding = self.encoding
        except Exception as e:
            printException(u'Exception in HTMLEncodingExtractor.extract()', e)
            encoding = None
        finally:
            # always release, even if the handler itself fails, otherwise every
            # later caller would block forever on the global lock
            _htmlEncodingExtractorLock.release()

        return encoding
-
-
class HTMLFormatter (SGMLParser):
    """
    A SGMLParser-derived parser to rewrite web page encoding into utf-8
    -------------------------------------------------------------------
    encoding: the charset originally declared by the rewritten content-type
              <META> tag in the last formatted document (None if none found)

    Call format(src, insertCharset, hasTagHTML, hasTagHead) to get the
    modified HTML src.
    """
    def reset(self):
        # extend (called by SGMLParser.__init__)
        SGMLParser.reset(self)
        self._pieces = []       # output fragments, joined in format()
        self._tagStack = []     # currently open tags, innermost last
        self.encoding = None
        self.insertCharset = 0  # see format() for the meaning of 0..3

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
        # Ideally we would like to reconstruct original tag and attributes, but
        # we may end up quoting attribute values that weren't quoted in the source
        # document, or we may change the type of quotes around the attribute value
        # (single to double quotes).
        # Note that improperly embedded non-HTML code (like client-side Javascript)
        # may be parsed incorrectly by the ancestor, causing runtime script errors.
        # All non-HTML code must be enclosed in HTML comment tags (<!-- code -->)
        # to ensure that it will pass through this parser unaltered (in handle_comment).
        strattrs = ''.join([' %s="%s"' % (key, value) for key, value in attrs])
        self._pieces.append('<%s%s>' % (tag, strattrs))

        # inject the utf-8 declaration right after the tag chosen in format()
        if tag == 'html':
            if self.insertCharset == 2:
                self._pieces.append('\n<head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head>')
        elif tag == 'head':
            if self.insertCharset == 3:
                self._pieces.append('\n<meta http-equiv="content-type" content="text/html; charset=utf-8">')

        self._tagStack.append(tag)

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be "pre"
        # Reconstruct the original end tag.
        self._pieces.append('</%s>' % tag)
        # pop only when the end tag closes the innermost open tag
        if self._tagStack and self._tagStack[-1] == tag:
            del self._tagStack[-1]

    def do_meta (self, attrs):
        # A <meta> is a candidate charset declaration only directly inside
        # <html><head>, inside a lone <head>, or before any tag has been opened;
        # anywhere else it is copied through like any other start tag.
        stack = self._tagStack
        if stack in ([], ['head']) or stack[-2:] == ['html', 'head']:
            self._rewriteEncoding(attrs)
        else:
            self.unknown_starttag('meta', attrs)

    def _rewriteEncoding (self, attrs):
        """Emit a <meta> tag, replacing a declared charset with utf-8.

        attrs is a list of (name, value) tuples. If the tag is a content-type
        declaration carrying a charset, the original charset is remembered in
        self.encoding and the emitted content attribute names utf-8 instead.
        Every other <meta> is emitted with its attributes unchanged.
        """
        attrDict = dict(attrs)
        httpEquiv = attrDict.get('http-equiv')
        if httpEquiv and httpEquiv.lower() == 'content-type':
            # .get(): a content-type meta without a content attribute must not
            # raise KeyError, which would make format() discard the whole document
            content = attrDict.get('content')
            if content:
                mo = _metaCharsetPat.search(content)
                if mo:
                    self.encoding = mo.group(1)
                    # splice 'utf-8' in place of the original charset name
                    attrDict['content'] = '%sutf-8%s' % (content[:mo.start(1)], content[mo.end(1):])

        strattrs = ''.join([' %s="%s"' % (key, value) for key, value in attrDict.items()])
        self._pieces.append('<meta%s>' % strattrs)
        # NOTE we do not push 'meta' into _tagStack - cuz multiple meta would sabotage parsing!

    def handle_charref(self, ref):
        # called for each character reference, e.g. for "&#160;", ref will be "160"
        # Reconstruct the original character reference.
        self._pieces.append('&#%s;' % ref)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for "&copy;", ref will be "copy"
        # Reconstruct the original entity reference.
        self._pieces.append('&%s' % ref)
        # standard HTML entities are closed with a semicolon; other entities are not
        if ref in htmlentitydefs.entitydefs:
            self._pieces.append(';')

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        self._pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        # It is especially important that the source document enclose client-side
        # code (like Javascript) within comments so it can pass through this
        # processor undisturbed; see comments in unknown_starttag for details.
        self._pieces.append('<!--%s-->' % text)

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self._pieces.append('<?%s>' % text)

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self._pieces.append('<!%s>' % text)

    def format(self, src, insertCharset, hasTagHTML, hasTagHead):
        """Return formatted HTML as a single string

        src:           the HTML source to rewrite
        insertCharset: if true, a utf-8 content-type <META> tag is inserted
        hasTagHTML:    whether src contains an <HTML> tag
        hasTagHead:    whether src contains a <HEAD> tag

        The last two decide where the inserted tag goes. Any exception raised
        while parsing is reported through printException() and '' is returned.
        """
        # thread-safety: multiple threads may call format() and access the ivars simultaneously
        _htmlFormatterLock.acquire()
        try:
            self.reset()

            # self.insertCharset
            # 0: don't insert charset meta tag
            # 1: insert on the top
            # 2: insert right after <HTML>
            # 3: insert right after <HEAD>

            if insertCharset:
                if hasTagHead: self.insertCharset = 3
                elif hasTagHTML: self.insertCharset = 2
                else: self.insertCharset = 1

            self.feed(src)
            self.close()

            ret = ''.join(self._pieces)
            if self.insertCharset == 1:
                ret = '<meta http-equiv="content-type" content="text/html; charset=utf-8">\n%s' % ret

        except Exception as e:
            printException(u'Exception in HTMLFormatter.format()', e)
            ret = ''
        finally:
            # always release, even if the handler itself fails, otherwise every
            # later caller would block forever on the global lock
            _htmlFormatterLock.release()

        return ret
-